# Load the "autoreload" extension
%load_ext autoreload
# always reload modules marked with "%aimport"
%autoreload 1
import os
import sys
# Make the project's 'src' directory importable.
# root_dir is resolved relative to the current working directory (two levels
# up), so these paths depend on where the notebook/script is launched from.
root_dir = os.path.join(os.getcwd(),os.pardir,os.pardir)
src_dir = os.path.join(root_dir, 'src')
# Guard against appending the same entry again when the cell is re-run.
if src_dir not in sys.path: sys.path.append(src_dir)
import math
import copy as cp
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.style
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans,Birch,AgglomerativeClustering
from sklearn.manifold import TSNE
from sklearn import preprocessing
from scipy.cluster import hierarchy
from scipy import stats
from scipy.stats import mstats
import helpers as hlp
%aimport helpers
from external import kMedoids
from IPython.display import display
# --- Display options, file locations and data loading -----------------------
# Show floats with thousands separators and two decimals in DataFrame output.
pd.options.display.float_format = '{:,.2f}'.format

# Project directories, all below root_dir. The path components are joined one
# by one so the result is valid on any OS (the previous "data\\raw\\" literals
# only worked on Windows). The trailing '' keeps the trailing separator that
# the "directory + file name" concatenations below rely on.
raw_path = os.path.join(root_dir, "data", "raw", "")
interim_path = os.path.join(root_dir, "data", "interim", "")
processed_path = os.path.join(root_dir, "data", "processed", "")
reports_path = os.path.join(root_dir, "reports", "")
models_path = os.path.join(root_dir, "models", "")

raw_file_name = "bnd_product_p2c4_raw.csv"
clean_file_name = "bnd_product_p2c4_clean.csv"
z_file_name = "bnd_product_z_p2c4_clean.csv"

# Leading identifier columns; everything after them is treated as features.
row_headers = ['Product', 'Client']
n_row_headers = len(row_headers)

product_raw_df = pd.read_csv(interim_path + raw_file_name, sep=';', encoding='utf-8')
product_df = pd.read_csv(interim_path + clean_file_name, sep=';', encoding='utf-8')
product_df_full = pd.read_csv(processed_path + z_file_name, sep=';', encoding='utf-8')

# Feature matrix: every column after the row headers.
X_z = product_df_full.values[:, n_row_headers:]
nb_col = X_z.shape[1]
# PCA keeping as many components as there are feature columns; the projection
# is only used to draw the clusters further down.
X_pca = PCA(n_components=nb_col).fit_transform(X_z)
product_df_full.head()
Try out hierarchical clustering, k-means and k-medoids on the raw (cleaned) data. Then plot the PCA projection to visualize the clustering results on the principal components.
from scipy.cluster import hierarchy

# Score registries, filled in as each clustering method is evaluated.
SSE = {}
SILOUHAITE = {}

# Agglomerative clustering with complete linkage on Euclidean distances.
Z = hierarchy.linkage(X_z, method='complete', metric='euclidean')
# Truncated dendrogram: only the last 50 merged clusters are drawn.
dn = hierarchy.dendrogram(Z, truncate_mode='lastp', p=50, leaf_rotation=90., leaf_font_size=7., show_contracted=True)
plt.show()
plt.figure()

# Cut the tree into at most 100 flat clusters.
labels_h_cc = hierarchy.fcluster(Z, t=100, criterion='maxclust')
hlp.Clusters_plot(X=X_pca, labels=labels_h_cc, info=["AgglomerativeClustering", "Complete - euclidean ", "%d clusters" % len(set(labels_h_cc))])

# fcluster returns cluster ids, not row positions, so X_z[labels_h_cc] picked
# unrelated rows as "centres". Compare every observation with the mean of its
# own cluster instead.
# NOTE(review): assumes hlp.getSSE expects one reference row per observation,
# as the shape of the original second argument suggests - confirm in helpers.
centroids_h_cc = pd.DataFrame(X_z.astype(float)).groupby(labels_h_cc).transform('mean').values
SSE["Agg_complete"] = hlp.getSSE(X_z, centroids_h_cc)
# Elbow analysis on the merge distances (third column of the linkage matrix)
# of the final 150 merges.
last = Z[-150:, 2]
last_rev = np.flip(last)
idxs = np.arange(len(last)) + 1
plt.plot(idxs, last_rev)

# Second-order difference of the merge distances, reversed so that it lines
# up with the reversed distance curve.
acceleration = np.diff(last, n=2)
acceleration_rev = np.flip(acceleration)
plt.plot(idxs[1:-1], acceleration_rev)
plt.xticks(np.arange(1, last.size))
plt.show()

# Rank candidates by the magnitude of the acceleration, strongest first.
best_ks = np.argsort(np.abs(acceleration_rev))[::-1]
# Position 0 of the reversed acceleration corresponds to 2 clusters.
k = 2 + best_ks
print("clusters:", k)
%matplotlib inline
clusters= np.linspace(70,120,25).astype(int)
inertia = []
silouhaite = []
for cluster in clusters:
kmeans = KMeans(n_clusters=cluster).fit(X_z)
silouhaite.append(hlp.getSilouhaite(X_z,kmeans.labels_))
inertia += [np.sqrt(kmeans.inertia_/len(kmeans.labels_))]
plt.figure(figsize=(16,4))
plt.subplot(1,2,1)
inertia = np.array(inertia)
plt.title("Total inertia according to clusters")
plt.plot(np.arange(0,len(clusters)),inertia)#scale it to acc2
plt.xticks(np.arange(0,len(clusters)),clusters)
acc = np.diff(inertia, 2) # 2nd derivative of the inertia curve
#plt.plot(np.arange(2,len(clusters)), acc)
best_ks = acc.argsort()[::-1]
k = best_ks+ 2 # if idx 0 is the max of this we want 2 clusters
print ("clusters:",clusters[k])
plt.subplot(1,2,2)
silouhaite = np.array(silouhaite)
plt.title("Silouhaite score according to clusters")
plt.plot(np.arange(0,len(clusters)),silouhaite)#scale it to acc2
plt.xticks(np.arange(0,len(clusters)),clusters)
best_ks = silouhaite.argsort()[::-1]
print("clusters:",clusters[best_ks])
plt.show()
# Ward-linkage agglomerative clustering with the chosen number of clusters.
n_cluster = 113
from sklearn.cluster import AgglomerativeClustering
ward = AgglomerativeClustering(n_clusters=n_cluster, linkage='ward').fit(X_z)
label = ward.labels_

# ward.labels_ holds cluster ids, not row positions, so X_z[label] selected
# unrelated rows as "centres". Use the mean of each observation's own cluster.
# NOTE(review): assumes hlp.getSSE expects one reference row per observation,
# as the shape of the original second argument suggests - confirm in helpers.
centroids_ward = pd.DataFrame(X_z.astype(float)).groupby(label).transform('mean').values
SSE['Ward'] = hlp.getSSE(X_z, centroids_ward)
print(SSE['Ward'])
hlp.Clusters_plot(X=X_pca, labels=label, info=["AgglomerativeClustering", "Ward", "%d clusters" % len(set(label))])
%matplotlib inline
kmeans = KMeans(n_clusters=n_cluster).fit(X_z)
label = kmeans.labels_
labels_kmeans = label
SSE["kMeans"] = hlp.getSSE(X_z,X_z[labels_kmeans])
X = X_pca[:,:3]
hlp.Clusters_plot(X= X_pca, labels = label,info=["K-Means","Euclidean","%d clusters"%len(set(label))])
from scipy.stats import spearmanr
def spearmanr_dist(x, y):
    """Return the Spearman rank-correlation distance between *x* and *y*.

    The distance is ``1 - rho`` where ``rho`` is the Spearman correlation
    coefficient: 0 for perfectly concordant rankings, 2 for perfectly
    reversed ones. Returning ``rho`` itself (as before) made identical
    vectors the *farthest* apart, which is wrong for a function meant to
    be passed as a ``pdist`` metric.
    """
    rho, _ = spearmanr(x, y)  # the p-value is not needed
    return 1.0 - rho
# Spearman correlation matrix (and p-values) between the columns of X_z;
# the diagonal is zeroed so self-correlations do not dominate.
r, p = spearmanr(X_z)
np.fill_diagonal(r, 0)
from external import kMedoids
from scipy.spatial.distance import pdist, squareform
# Number of observations = number of rows. (shape[1], used before, is the
# number of feature columns, which is already available as nb_col.)
n_obs = X_z.shape[0]
# Square pairwise distance matrices between observations (rows of X_z).
corr_distance = squareform(pdist(X_z, 'correlation'))
euclid_distance = squareform(pdist(X_z, 'euclidean'))
sqcorr_distance = corr_distance**2
#spearman_distance = squareform(pdist(X_z, lambda u, v: spearmanr_dist(u,v)))
%matplotlib inline
clusters= np.linspace(70,115,25).astype(int)
silouhaite = []
inertia = []
for cluster in clusters:
labels, medoids = kMedoids.cluster(euclid_distance,k= cluster)
silouhaite.append(hlp.getSilouhaite(X_z,labels))
sse = hlp.getSSE(X_z,X_z[labels])
inertia.append(np.sqrt(sse/len(labels_kmedoids)))
plt.figure(figsize=(16,4))
plt.subplot(1,2,1)
inertia = np.array(inertia)
plt.title("Total inertia according to clusters")
plt.plot(np.arange(0,len(clusters)),inertia)#scale it to acc2
plt.xticks(np.arange(0,len(clusters)),clusters)
acc = np.diff(inertia, 2) # 2nd derivative of the inertia curve
#plt.plot(np.arange(2,len(clusters)), acc)
best_ks = acc.argsort()[::-1]
k = best_ks+ 2 # if idx 0 is the max of this we want 2 clusters
print ("clusters:",clusters[k])
plt.subplot(1,2,2)
silouhaite = np.array(silouhaite)
plt.title("Silouhaite score according to clusters")
plt.plot(np.arange(0,len(clusters)),silouhaite)#scale it to acc2
plt.xticks(np.arange(0,len(clusters)),clusters)
best_ks = silouhaite.argsort()[::-1]
print("clusters:",clusters[best_ks])
# Final k-medoids runs, one per distance measure.
n_cluster = 113
label, medoids_euc = kMedoids.cluster(euclid_distance, k=n_cluster)
labels_kmedoids = label
labels_kmedoids_corr, medoids_corr = kMedoids.cluster(corr_distance, k=n_cluster)

# The "spear" run previously clustered on corr_distance again, so it was just
# a second Pearson run. Spearman correlation is the Pearson correlation of
# the ranks, so rank every observation (row) and take the correlation
# distance of the ranks - far cheaper than calling spearmanr for every pair.
row_ranks = np.apply_along_axis(stats.rankdata, 1, X_z.astype(float))
spearman_distance = squareform(pdist(row_ranks, 'correlation'))
labels_kmedoids_spear, medoids_spear = kMedoids.cluster(spearman_distance, k=n_cluster)

# Record both quality scores for each variant.
SSE["kMedoids"] = hlp.getSSE(X_z, X_z[labels_kmedoids])
SSE["kMedoids_corr"] = hlp.getSSE(X_z, X_z[labels_kmedoids_corr])
SSE["kMedoids_spear"] = hlp.getSSE(X_z, X_z[labels_kmedoids_spear])
SILOUHAITE["kMedoids"] = hlp.getSilouhaite(X_z, labels_kmedoids)
SILOUHAITE["kMedoids_corr"] = hlp.getSilouhaite(X_z, labels_kmedoids_corr)
SILOUHAITE["kMedoids_spear"] = hlp.getSilouhaite(X_z, labels_kmedoids_spear)

hlp.Clusters_plot(
    X=X_pca,
    labels=label,
    info=["PCA K-Medoids", "Euclidienne", "%d clusters :inertia %.2f" % (len(set(label)), SSE["kMedoids"])],
)
#hlp.Clusters_plot(X= X_tsne, labels = label,info=["TSNE K-Medoids","Correlation","%d clusters"%len(set(label))])
# 3-component t-SNE embedding, coloured by the Euclidean k-medoids clusters.
X_tsne = TSNE(n_components=3).fit_transform(X_z)
plt.figure(figsize=(14, 6))
# Map every label to a consecutive integer code so the colormap is applied.
# The previous str(label / 255.) grey-level strings ignored `cmap` and are
# not valid colours once a label exceeds 255.
_, colors = np.unique(labels_kmedoids, return_inverse=True)
# Title with the score of the clustering that is actually drawn (k-medoids),
# not the inertia of the unrelated k-means model.
plt.suptitle("Total inertia %.02f" % SSE["kMedoids"])
plt.subplot(1, 2, 1)
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], cmap="Paired", c=colors, s=20)
plt.subplot(1, 2, 2)
plt.scatter(X_tsne[:, 0], X_tsne[:, 2], cmap="Paired", c=colors, s=20)
plt.show(block=True)
# Print every recorded SSE, a blank line, then every recorded silhouette score.
for position, scores in enumerate((SSE, SILOUHAITE)):
    if position:
        print()
    for name, value in scores.items():
        print(" \"%s\" : %.2f" % (name, value))
# Output location and versioned file name for the exported cluster tables.
# Built with os.path.join so it is valid on any OS; on Windows this yields the
# same "..\data\processed\" string as the previous hard-coded literal.
processed_path = os.path.join(os.pardir, "data", "processed", "")
version = 6
# (The earlier unversioned "p2c4_clustering_clean.csv" assignment was dead:
# it was overwritten immediately by this one.)
file_name = "p2c4_clustering_clean_week_v%d.csv" % version
def labels_to_df(labels):
    """Build a table assigning every row of ``product_df_full`` to a cluster.

    Args:
        labels: one label per row of ``product_df_full``. Each label is also
            used as a row position to look up the headers of the cluster's
            representative, so it must be a valid row index.

    Returns:
        DataFrame with the ``row_headers`` columns, a 1-based ``Cluster``
        number and a ``Centroid`` column holding the string form of the
        representative row's header tuple.
    """
    # Number the distinct labels 1..k in sorted order so the numbering is
    # reproducible instead of depending on set iteration order.
    cluster_numbers = {
        medoid: number
        for number, medoid in enumerate(sorted(set(labels)), start=1)
    }
    headers_list = [
        tuple(row)
        for row in product_df_full[row_headers].itertuples(index=False)
    ]
    rows = []
    for i, header in enumerate(headers_list):
        medoid = labels[i]
        # list(header) keeps this valid for any number of header columns,
        # not just the two that were hard-coded before.
        rows.append(
            list(header)
            + [cluster_numbers[medoid], "%s" % (headers_list[medoid],)]
        )
    return pd.DataFrame(rows, columns=row_headers + ["Cluster", "Centroid"])
# Turn each k-medoids labelling into a table.
eucl_df = labels_to_df(labels_kmedoids)
corr_df = labels_to_df(labels_kmedoids_corr)
spear_df = labels_to_df(labels_kmedoids_spear)

# Write one ';'-separated CSV per distance measure into the models directory.
for prefix, frame in (("euc_", eucl_df), ("corr_", corr_df), ("spear_", spear_df)):
    frame.to_csv(models_path + prefix + file_name, sep=';', encoding='utf-8')

print(eucl_df.shape)
eucl_df.head()
# Plot the series of each Euclidean k-medoids cluster.
carr = hlp.Cluster_series_plot(
    data_df=product_df_full,
    cluster_df=eucl_df,
    headers=row_headers,
)